From 35c30f3acafdaf79950013ecce88016969632c0c Mon Sep 17 00:00:00 2001 From: jichang Date: Sun, 19 Nov 2023 20:48:07 +0800 Subject: [PATCH 1/2] update HHS and BBS yaml for gf942 for NN/TN/NT square sizes --- .../aquavanjaram_Cijk_Ailk_Bjlk_BBS_BH.yaml | 1122 +++++++++++++++++ .../aquavanjaram_Cijk_Ailk_Bjlk_HHS_BH.yaml | 1122 +++++++++++++++++ .../aquavanjaram_Cijk_Ailk_Bljk_BBS_BH.yaml | 865 +++++++++++++ .../aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml | 703 +++++++++-- .../aquavanjaram_Cijk_Alik_Bljk_BBS_BH.yaml | 865 +++++++++++++ .../aquavanjaram_Cijk_Alik_Bljk_HHS_BH.yaml | 1122 +++++++++++++++++ 6 files changed, 5711 insertions(+), 88 deletions(-) create mode 100644 library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bjlk_BBS_BH.yaml create mode 100644 library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bjlk_HHS_BH.yaml create mode 100644 library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_BBS_BH.yaml create mode 100644 library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH.yaml create mode 100644 library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_HHS_BH.yaml diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bjlk_BBS_BH.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bjlk_BBS_BH.yaml new file mode 100644 index 0000000000..28563e0cd6 --- /dev/null +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bjlk_BBS_BH.yaml @@ -0,0 +1,1122 @@ +- {MinimumRequiredVersion: 4.33.0} +- aquavanjaram +- gfx942 +- [Device 0049, Device 0050] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA0_LBSPPB0_LPA0_LPB0_MIWT2_2_NLCB1_SVW2_VWA2_VWB2_WG64_4_1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA0_LBSPPB0_LPA0_LPB0_MIWT2_2_NLCB1_SVW2_VWA2_VWB2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LBSPPB1792_LPA16_LPB16_MIWT4_14_NLCB7_SVW4_VWA4_VWB2_WG64_4_1 + LSCA: 256 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1792 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 31232 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 14592 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LBSPPB1792_LPA16_LPB16_MIWT4_14_NLCB7_SVW4_VWA4_VWB2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 0 + TransposeLDSMetadata: true + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA2048_LBSPPB1792_LPA16_LPB16_MIWT4_14_NLCB7_SVW4_VWA4_VWB2_WG64_4_1 + LSCA: 256 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1792 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 31232 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 14592 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA2048_LBSPPB1792_LPA16_LPB16_MIWT4_14_NLCB7_SVW4_VWA4_VWB2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 0 + TransposeLDSMetadata: true + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x128_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA0_LBSPPB0_LPA0_LPB0_MIWT2_2_NLCB1_SVW2_VWA2_VWB2_WG64_4_1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 32768 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 16 + LoopUnroll: 128 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x128_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA0_LBSPPB0_LPA0_LPB0_MIWT2_2_NLCB1_SVW2_VWA2_VWB2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [512, 512, 1, 512, 512, 512, 512, 512] + - [0, 0.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [0, 0.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [3, 0.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [2, 0.0] + - - [4096, 4096, 1, 8192, 4096, 4096, 4096, 4096] + - [1, 0.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [1, 0.0] +- null +- null +- DeviceEfficiency +- null +- Equality diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bjlk_HHS_BH.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bjlk_HHS_BH.yaml new file mode 100644 index 0000000000..697fe37283 --- /dev/null +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bjlk_HHS_BH.yaml @@ -0,0 +1,1122 @@ +- {MinimumRequiredVersion: 4.33.0} +- aquavanjaram +- gfx942 +- [Device 0049, Device 0050] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x64x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA1024_LBSPPB512_LPA16_LPB16_MIWT2_4_NLCB1_SVW2_VWA2_VWB4_WG64_4_1 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 24832 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 24832 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x64x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA1024_LBSPPB512_LPA16_LPB16_MIWT2_4_NLCB1_SVW2_VWA2_VWB4_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LBSPPB1792_LPA16_LPB16_MIWT4_14_NLCB7_SVW4_VWA4_VWB2_WG64_4_1 + LSCA: 256 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1792 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 31232 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 14592 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LBSPPB1792_LPA16_LPB16_MIWT4_14_NLCB7_SVW4_VWA4_VWB2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 0 + TransposeLDSMetadata: true + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA2048_LBSPPB1792_LPA16_LPB16_MIWT4_14_NLCB7_SVW4_VWA4_VWB2_WG64_4_1 + LSCA: 256 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1792 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 31232 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 14592 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA2048_LBSPPB1792_LPA16_LPB16_MIWT4_14_NLCB7_SVW4_VWA4_VWB2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 0 + TransposeLDSMetadata: true + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x128_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA0_LBSPPB0_LPA0_LPB0_MIWT2_2_NLCB1_SVW2_VWA2_VWB2_WG64_4_1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 32768 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 16 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x128_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA0_LBSPPB0_LPA0_LPB0_MIWT2_2_NLCB1_SVW2_VWA2_VWB2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [512, 512, 1, 512, 512, 512, 512, 512] + - [0, 0.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 0.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [3, 0.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [1, 0.0] + - - [4096, 4096, 1, 8192, 4096, 4096, 4096, 4096] + - [2, 0.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [1, 0.0] +- null +- null +- DeviceEfficiency +- null +- Equality diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_BBS_BH.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_BBS_BH.yaml new file mode 100644 index 0000000000..a6244ae480 --- /dev/null +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_BBS_BH.yaml @@ -0,0 +1,865 @@ +- {MinimumRequiredVersion: 4.33.0} +- aquavanjaram +- gfx942 +- [Device 0049, Device 0050] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA0_LBSPPB128_LPA0_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 17408 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA0_LBSPPB128_LPA0_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x32x64_MI16x16x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA2048_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 10880 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 24704 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 10880 + LdsOffsetMetadata_Blk: 24704 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x32x64_MI16x16x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA2048_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LBSPPB128_LPA16_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 31872 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 15232 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31872 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LBSPPB128_LPA16_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [512, 512, 1, 512, 512, 512, 512, 512] + - [1, 0.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [0, 0.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [0, 0.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [2, 0.0] + - - [4096, 4096, 1, 8192, 4096, 4096, 4096, 8192] + - [2, 0.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [2, 0.0] +- null +- null +- DeviceEfficiency +- null +- Equality diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml index efc0225684..cee421c358 100644 --- a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml @@ -14884,10 +14884,521 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO0_WSGRB1_WGM1 + LSCA: 256 + LSCB: 64 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 28032 + LdsNumElementsAlignedA: 16512 + LdsNumElementsAlignedB: 11520 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16512 + LdsOffsetB_Blk: 49280 + LdsOffsetMetadata: 28032 + LdsOffsetMetadata_Blk: 49280 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 144 + MacroTileA: 256 + MacroTileB: 144 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 8 + NumLoadsB: 18 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 18 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO0_WSGRB1_WGM1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSUM_LBSPPA2048_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA2048_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14896,65 +15407,68 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO2_WSGRB0_WGM1 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWA8_GRVWB8_GSUM_LBSPPA0_LBSPPB128_LPA0_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + LSCA: 128 LSCB: 64 LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPB: 4 + LVCA: 16 + LVCB: 8 LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 4096 + LVPB: 1 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 28032 - LdsNumElementsAlignedA: 16512 - LdsNumElementsAlignedB: 11520 + LdsNumElements: 17408 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16512 - LdsOffsetB_Blk: 49280 - LdsOffsetMetadata: 28032 - LdsOffsetMetadata_Blk: 49280 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 8 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 16, 1, 1, 1] + MIBlock: [32, 32, 8, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 9] - MIWaveTileA: 4 - MIWaveTileB: 9 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 144 - MacroTileA: 256 - MacroTileB: 144 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -14965,18 +15479,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 8 - NumLoadsB: 9 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15062,26 +15576,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO2_WSGRB0_WGM1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA0_LBSPPB128_LPA0_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 9 - ThreadTileA: 16 - ThreadTileB: 9 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -15092,10 +15606,10 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] @@ -15117,7 +15631,7 @@ ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -15138,7 +15652,7 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 4 @@ -15150,33 +15664,36 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO0_WSGRB1_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB4_GSUM_LBSPPA2048_LBSPPB128_LPA16_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 LSCA: 256 LSCB: 64 - LSPA: 4 - LSPB: 2 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 28032 - LdsNumElementsAlignedA: 16512 - LdsNumElementsAlignedB: 11520 + LdsNumElements: 31872 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16512 - LdsOffsetB_Blk: 49280 - LdsOffsetMetadata: 28032 - LdsOffsetMetadata_Blk: 49280 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31872 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -15193,14 +15710,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 9] + MIWaveTile: [4, 14] MIWaveTileA: 4 - MIWaveTileB: 9 + MIWaveTileB: 14 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 144 + MacroTile1: 224 MacroTileA: 256 - MacroTileB: 144 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -15219,18 +15736,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 NumLoadsA: 8 - NumLoadsB: 18 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 18 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15316,11 +15833,11 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO0_WSGRB1_WGM1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB4_GSU1_LBSPPA2048_LBSPPB128_LPA16_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 @@ -15333,9 +15850,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 9 + ThreadTile1: 14 ThreadTileA: 16 - ThreadTileB: 9 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -15348,8 +15865,8 @@ VectorStore: -1 VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] @@ -15498,9 +16015,19 @@ - - [8192, 8192, 1, 1024] - [56, 0.0] - - [8192, 8192, 1, 8192] - - [62, 0.0] + - [65, 0.0] - - [8192, 8192, 1, 65536] + - [62, 0.0] + - - [512, 512, 1, 512] - [63, 0.0] + - - [1024, 1024, 1, 1024] + - [64, 0.0] + - - [2048, 2048, 1, 2048] + - [64, 0.0] + - - [4096, 4096, 1, 4096] + - [65, 0.0] + - - [4096, 4096, 1, 8192] + - [65, 0.0] - null - null - DeviceEfficiency diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH.yaml new file mode 100644 index 0000000000..5ca453b3a1 --- /dev/null +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_BBS_BH.yaml @@ -0,0 +1,865 @@ +- {MinimumRequiredVersion: 4.33.0} +- aquavanjaram +- gfx942 +- [Device 0049, Device 0050] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA256_LBSPPB128_LPA8_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 17920 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA256_LBSPPB128_LPA8_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x64x64_MI16x16x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 14336 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x64x64_MI16x16x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA512_LBSPPB128_LPA8_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 32128 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 15232 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32128 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 8 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: true + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DataTypeA: 7 + DataTypeB: 7 + DataTypeE: 7 + DestDataType: 7 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA512_LBSPPB128_LPA8_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [512, 512, 1, 512, 512, 512, 512, 512] + - [1, 0.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [1, 0.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [0, 0.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [2, 0.0] + - - [4096, 4096, 1, 8192, 4096, 4096, 8192, 8192] + - [2, 0.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [2, 0.0] +- null +- null +- DeviceEfficiency +- null +- Equality diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_HHS_BH.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_HHS_BH.yaml new file mode 100644 index 0000000000..aff94a2765 --- /dev/null +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Alik_Bljk_HHS_BH.yaml @@ -0,0 +1,1122 @@ +- {MinimumRequiredVersion: 4.33.0} +- aquavanjaram +- gfx942 +- [Device 0049, Device 0050] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA256_LBSPPB128_LPA8_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 17920 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17920 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 8, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA256_LBSPPB128_LPA8_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x64x64_MI16x16x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 14336 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x64x64_MI16x16x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA256_LBSPPB128_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA512_LBSPPB128_LPA8_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 32128 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 15232 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32128 + LdsOffsetMetadata_Blk: 49664 + LdsPadA: 8 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA512_LBSPPB128_LPA8_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + ClusterLocalRead: 1 + CodeObjectVersion: V3 + CustomKernelName: '' + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 4, 2] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 4 + MIInputPerThreadA: 4 + MIInputPerThreadB: 4 + MIInputPerThreadMetadata: 4 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + ProblemType: + Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Fp16AltImpl: false + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StridedBatched: true + SupportUserArgs: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: false + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: false + UseScaleAlphaVec: false + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA512_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 + WaveSeparateGlobalReadMetadata: 0 + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupReduction: false + WorkspaceCheck: [0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [512, 512, 1, 512, 512, 512, 512, 512] + - [1, 0.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 0.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [0, 0.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [2, 0.0] + - - [4096, 4096, 1, 8192, 4096, 4096, 8192, 8192] + - [2, 0.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [2, 0.0] +- null +- null +- DeviceEfficiency +- null +- Equality From 4b7a67ebc181307215a6c859ed05db3c6e29a870 Mon Sep 17 00:00:00 2001 From: jichang Date: Mon, 20 Nov 2023 17:33:57 +0800 Subject: [PATCH 2/2] update HHS/HF8_HHS NN yaml for gfx942 --- ...quavanjaram_Cijk_Ailk_Bljk_HF8_HHS_BH.yaml | 2904 +++++++------- .../aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml | 3458 ++++++++--------- 2 files changed, 3033 insertions(+), 3329 deletions(-) diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HF8_HHS_BH.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HF8_HHS_BH.yaml index 0d487fd01a..e0c3d990a7 100644 --- a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HF8_HHS_BH.yaml +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HF8_HHS_BH.yaml @@ -8671,7 +8671,7 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -8681,29 +8681,29 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x128_MI16x16x1_SN_GSU1_MIWT1_1_NTA2_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM16 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSUM_MIWT1_1_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB1_WGM110 + LSCA: 32 LSCB: 128 - LSPA: 32 + LSPA: 16 LSPB: 4 - LVCA: 8 + LVCA: 4 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 1 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 27136 - LdsNumElementsAlignedA: 8448 + LdsNumElements: 14848 + LdsNumElementsAlignedA: 4352 LdsNumElementsAlignedB: 2304 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 12544 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 12544 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -8723,14 +8723,14 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 @@ -8746,9 +8746,9 @@ NoLdsWriteCode: false NoReject: false NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 + NonTemporal: 0 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 @@ -8757,12 +8757,12 @@ NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -8846,18 +8846,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x128_MI16x16x1_SN_GSU1_MIWT1_1_NTA2_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM16 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSU8_MIWT1_1_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB1_WGM110 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 StorePriorityOpt: 1 - StoreRemapVectorWidth: 4 + StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] @@ -8877,25 +8877,25 @@ VectorStore: -1 VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadA: 1 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 110 WorkGroupReduction: false - WorkspaceCheck: [0, 0] + WorkspaceCheck: [32, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: null + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 + _WorkspaceSizePerElemC: 32 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -8910,7 +8910,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 16 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -8921,8 +8921,8 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 14 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -8932,65 +8932,65 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x32x16_MI32x32x1_SN_GSUM_MIWT1_1_NTA0_NTB1_SU0_SUS0_WSGRA0_WSGRB0_WGM56 - LSCA: 128 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 16 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSUM_MIWT1_1_NTA0_NTB0_SU0_SUS0_WSGRA1_WSGRB1_WGM1 + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 8448 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 27648 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsOffsetMetadata: 2688 - LdsOffsetMetadata_Blk: 6144 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 18944 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 18944 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 16 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [1, 4] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -8999,20 +8999,20 @@ NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9097,24 +9097,24 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x32x16_MI32x32x1_SN_GSU14_MIWT1_1_NTA0_NTB1_SU0_SUS0_WSGRA0_WSGRB0_WGM56 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSU2_MIWT1_1_NTA0_NTB0_SU0_SUS0_WSGRA1_WSGRB1_WGM1 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 StorePriorityOpt: 1 - StoreRemapVectorWidth: 4 + StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -9128,25 +9128,25 @@ VectorStore: -1 VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 56 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [56, 0] - _DepthU: 16 - _DepthUA: 16 - _DepthUB: 16 - _DepthUMetadata: 16 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [8, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 56 + _WorkspaceSizePerElemC: 8 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -9161,7 +9161,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9172,10 +9172,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9183,65 +9183,65 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GSUM_MIWT1_1_NTA0_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 2 - LVCA: 8 - LVCB: 32 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 512 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_4_NTA0_NTB1_SU0_SUS0_WSGRA1_WSGRB1_WGM1 + LSCA: 256 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 13632 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 12800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 12544 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 12544 - LdsPadA: 16 - LdsPadB: 4 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 4 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] + MIBlock: [32, 32, 8, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -9256,22 +9256,22 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 + PrefetchLocalRead: 1 PreloadKernArgs: false ProblemType: Activation: false @@ -9348,7 +9348,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GSU4_MIWT1_1_NTA0_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_4_NTA0_NTB1_SU0_SUS0_WSGRA1_WSGRB1_WGM1 SourceSwap: 1 StaggerU: 0 StaggerUMapping: 0 @@ -9356,17 +9356,17 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -9377,33 +9377,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadA: 1 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [64, 2, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [16, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 16 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -9423,53 +9423,54 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 19 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x16x64_MI16x16x1_SN_GSUM_MIWT2_1_NTA1_NTB0_SU4_SUS256_WSGRA1_WSGRB1_WGM1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPMn1_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO0_WSGRB1_WGM1 + LSCA: 256 LSCB: 64 LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 + LSPB: 8 + LVCA: 32 + LVCB: 8 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25920 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 25600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 4 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 8 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 8, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -9477,22 +9478,22 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -9500,21 +9501,21 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9522,11 +9523,12 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 + PrefetchLocalRead: 1 PreloadKernArgs: false ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -9540,6 +9542,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -9594,30 +9597,30 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x16x64_MI16x16x1_SN_GSU19_MIWT2_1_NTA1_NTB0_SU4_SUS256_WSGRA1_WSGRB1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPMn1_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO0_WSGRB1_WGM1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -9630,31 +9633,31 @@ VectorStore: -1 VectorWidthA: 2 VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [76, 0] + WorkspaceCheck: [0, 0] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 76 - _staggerStrideShift: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -9675,43 +9678,47 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 + GlobalSplitU: 19 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSUM_MIWT1_1_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB1_WGM110 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT32x16x128_MI16x16x1_SN_GSUM_LBSPPA256_LBSPPB256_MIWT1_1_NEPBS16_SPO1_SSO0_WG32_4_1 LSCA: 32 LSCB: 128 - LSPA: 16 + LSPA: 2 LSPB: 4 LVCA: 4 LVCB: 16 - LVPA: 2 + LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 14848 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 2304 + LdsNumElements: 14976 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 12544 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 12544 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 12800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 12800 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -9719,7 +9726,7 @@ LoopIters: 8 LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false + MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -9750,14 +9757,14 @@ NoLdsWriteCode: false NoReject: false NoTailLoop: false - NonTemporal: 0 + NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 @@ -9778,6 +9785,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -9791,6 +9799,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -9845,20 +9854,20 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSU8_MIWT1_1_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB1_WGM110 - SourceSwap: 0 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT32x16x128_MI16x16x1_SN_GSU19_LBSPPA256_LBSPPB256_MIWT1_1_NEPBS16_SPO1_SSO0_WG32_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 SubGroup0: 8 SubGroup1: 16 SubGroupA: 8 @@ -9881,14 +9890,14 @@ VectorStore: -1 VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [32, 4, 1] - WorkGroupMapping: 110 + WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [32, 0] + WorkspaceCheck: [76, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 @@ -9897,15 +9906,15 @@ _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 32 + _WorkspaceSizePerElemC: 76 _staggerStrideShift: 0 - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -9926,39 +9935,43 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: 64 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSUM_MIWT1_1_NTA0_NTB0_SU0_SUS0_WSGRA1_WSGRB1_WGM1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT32x16x128_MI16x16x1_SN_GSUM_LBSPPA256_LBSPPB256_MIWT1_1_NEPBS0_SPO1_SSO2_WG32_4_1 + LSCA: 32 LSCB: 128 - LSPA: 32 + LSPA: 2 LSPB: 4 - LVCA: 2 + LVCA: 4 LVCB: 16 - LVPA: 4 + LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 27648 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8704 + LdsNumElements: 14976 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 12800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 12800 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 @@ -9970,7 +9983,7 @@ LoopIters: 8 LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false + MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -9978,15 +9991,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -10011,13 +10024,13 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -10029,6 +10042,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -10042,6 +10056,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -10096,24 +10111,24 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSU2_MIWT1_1_NTA0_NTB0_SU0_SUS0_WSGRA1_WSGRB1_WGM1 - SourceSwap: 0 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT32x16x128_MI16x16x1_SN_GSU64_LBSPPA256_LBSPPB256_MIWT1_1_NEPBS0_SPO1_SSO2_WG32_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreSyncOpt: 2 + StoreVectorWidth: 1 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 @@ -10132,31 +10147,31 @@ VectorStore: -1 VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [8, 0] + WorkspaceCheck: [256, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: null + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 8 + _WorkspaceSizePerElemC: 256 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -10175,45 +10190,49 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSU1_MIWT1_1_NTA1_NTB2_SU0_SUS0_WSGRA0_WSGRB1_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GSUM_LBSPPB256_MIWT1_2_NTB0_NEPBS0_SPO1_SSO0_WG16_16_1 LSCA: 16 LSCB: 128 - LSPA: 64 + LSPA: 4 LSPB: 4 - LVCA: 4 + LVCA: 2 LVCB: 16 - LVPA: 16 + LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 27904 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 19968 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 18688 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 18688 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 35328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 35328 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -10221,7 +10240,7 @@ LoopIters: 8 LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false + MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -10230,14 +10249,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 16 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 16 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -10253,21 +10272,21 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -10280,6 +10299,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -10293,6 +10313,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -10347,20 +10368,20 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSU1_MIWT1_1_NTA1_NTB2_SU0_SUS0_WSGRA0_WSGRB1_WGM1 - SourceSwap: 0 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GSU8_LBSPPB256_MIWT1_2_NTB0_NEPBS0_SPO1_SSO0_WG16_16_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -10368,9 +10389,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -10383,31 +10404,31 @@ VectorStore: -1 VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] + WorkspaceCheck: [32, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: null + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 + _WorkspaceSizePerElemC: 32 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -10416,7 +10437,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -10428,96 +10449,100 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_GSU1_MIWT2_2_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB0_WGM4 - LSCA: 128 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSUM_LBSPPB256_MIWT1_1_NTB0_NEPBS16_SPO1_SSO2_WG16_16_1 + LSCA: 16 + LSCB: 128 LSPA: 4 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPB: 4 + LVCA: 2 + LVCB: 16 LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 17408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 27648 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 18944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 18944 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 - LoopUnroll: 64 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false - NonTemporal: 0 + NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -10526,11 +10551,12 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 + PrefetchLocalRead: 1 PreloadKernArgs: false ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -10544,6 +10570,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -10598,30 +10625,30 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_GSU1_MIWT2_2_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB0_WGM4 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSU8_LBSPPB256_MIWT1_1_NTB0_NEPBS16_SPO1_SSO2_WG16_16_1 SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 2 + StoreVectorWidth: 1 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -10632,33 +10659,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: null + WorkspaceCheck: [32, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 + _WorkspaceSizePerElemC: 32 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -10667,7 +10694,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -10681,73 +10708,77 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_2_NTA1_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM24 - LSCA: 256 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 4 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSUM_LBSPPB256_MIWT1_1_NTB0_NEPBS0_SPO0_SSO2_WG16_16_1 + LSCA: 16 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 2 + LVCB: 16 LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 12800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4608 + LdsNumElements: 27648 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 18944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 18944 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -10755,21 +10786,21 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -10782,6 +10813,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -10795,6 +10827,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -10849,30 +10882,30 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_2_NTA1_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM24 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSU1_LBSPPB256_MIWT1_1_NTB0_NEPBS0_SPO0_SSO2_WG16_16_1 SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSyncOpt: 2 + StoreVectorWidth: 1 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -10883,33 +10916,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -10918,7 +10951,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -10929,76 +10962,80 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_4_NTA0_NTB1_SU0_SUS0_WSGRA1_WSGRB1_WGM1 - LSCA: 256 - LSCB: 32 - LSPA: 2 - LSPB: 8 - LVCA: 32 - LVCB: 8 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSUM_LBSPPB256_MIWT1_1_NTB1_NEPBS0_SPO0_SSO0_WG16_16_1 + LSCA: 16 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 2 + LVCB: 16 LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 12800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4608 + LdsNumElements: 27648 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 18944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 18944 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -11009,19 +11046,19 @@ NonTemporalA: 0 NonTemporalB: 1 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11033,6 +11070,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -11046,6 +11084,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -11100,30 +11139,30 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_4_NTA0_NTB1_SU0_SUS0_WSGRA1_WSGRB1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSU1_LBSPPB256_MIWT1_1_NTB1_NEPBS0_SPO0_SSO0_WG16_16_1 SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -11134,32 +11173,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -11181,7 +11220,7 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 24 + GlobalSplitU: 9 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -11192,10 +11231,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x32x128_MI16x16x1_SN_LDSB0_GSUM_LBSPPB256_LWPM0p40_MIWT1_1_NTB1_NEPBS0_SPO0_SSO0_WSGRB1_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GSUM_LBSPPB256_MIWT1_2_NTB0_NEPBS0_SPO0_SSO0_WG16_16_1 LSCA: 16 LSCB: 128 - LSPA: 2 + LSPA: 4 LSPB: 4 LVCA: 2 LVCB: 16 @@ -11205,22 +11244,25 @@ LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 15104 + LdsNumElements: 19968 LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4352 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 32768 LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 + LdsOffsetB_Blk: 35328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 35328 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 @@ -11234,15 +11276,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -11259,21 +11301,21 @@ NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11299,6 +11341,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -11358,25 +11401,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x32x128_MI16x16x1_SN_LDSB0_GSU24_LBSPPB256_LWPM0p40_MIWT1_1_NTB1_NEPBS0_SPO0_SSO0_WSGRB1_WG16_8_1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GSU9_LBSPPB256_MIWT1_2_NTB0_NEPBS0_SPO0_SSO0_WG16_16_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -11393,10 +11436,10 @@ WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [96, 0] + WorkspaceCheck: [36, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 @@ -11405,14 +11448,14 @@ _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 96 + _WorkspaceSizePerElemC: 36 _staggerStrideShift: 0 - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -11433,8 +11476,8 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 62 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -11445,35 +11488,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x16x128_MI16x16x1_SN_LDSB0_GSUM_LBSPPB256_LWPM0p40_MIWT1_1_NTB0_NEPBS16_SPO0_SSO2_WSGRB1_WG16_4_1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWB4_GSUM_LBSPPA512_LBSPPB256_MIWT1_1_NTA1_NEPBS16_NLCA1_SPO0_SSO2_SVW1_VWA1_WG64_4_1 + LSCA: 64 LSCB: 128 - LSPA: 1 - LSPB: 4 - LVCA: 2 - LVCB: 16 + LSPA: 4 + LSPB: 2 + LVCA: 8 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 12928 - LdsNumElementsAlignedA: 2560 + LdsNumElements: 27200 + LdsNumElementsAlignedA: 8704 LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 25088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 25088 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 @@ -11487,14 +11533,14 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [4, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 @@ -11511,7 +11557,7 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 1 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 @@ -11521,12 +11567,12 @@ NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11552,6 +11598,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -11611,18 +11658,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x16x128_MI16x16x1_SN_LDSB0_GSU62_LBSPPB256_LWPM0p40_MIWT1_1_NTB0_NEPBS16_SPO0_SSO2_WSGRB1_WG16_4_1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWB4_GSU1_LBSPPA512_LBSPPB256_MIWT1_1_NTA1_NEPBS16_NLCA1_SPO0_SSO2_SVW1_VWA1_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 2 StoreVectorWidth: 1 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] @@ -11646,26 +11693,26 @@ WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [248, 0] + WorkspaceCheck: [0, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 248 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -11675,7 +11722,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -11686,10 +11733,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 9 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11698,39 +11745,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSUM_LBSPPB256_LWPM0p40_MIWT1_4_NTB0_NEPBS0_SPO1_SSO2_WSGRB1_WG16_8_1 - LSCA: 16 - LSCB: 128 - LSPA: 2 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x16x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LBSPPB128_MIWT4_1_NTA0_NEPBS0_NLCA1_SPO0_SSO0_SVW4_VWA4_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 2 + LVCA: 32 LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 19968 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 17408 + LdsNumElements: 17792 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 1152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 - LdsOffsetMetadata: 19968 - LdsOffsetMetadata_Blk: 35328 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17792 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -11740,15 +11790,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 4] - MIWaveTileA: 1 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 128 - MacroTileA: 16 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -11772,14 +11822,14 @@ NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11805,6 +11855,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -11864,25 +11915,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSU8_LBSPPB256_LWPM0p40_MIWT1_4_NTB0_NEPBS0_SPO1_SSO2_WSGRB1_WG16_8_1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x16x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU9_LBSPPA2048_LBSPPB128_MIWT4_1_NTA0_NEPBS0_NLCA1_SPO0_SSO0_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -11893,32 +11944,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [32, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [36, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 32 - _staggerStrideShift: 0 + _WorkspaceSizePerElemC: 36 + _staggerStrideShift: 1 - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -11939,8 +11990,8 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 7 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -11951,31 +12002,34 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GSUM_LBSPPB256_LWPMn1_MIWT1_1_NTB1_NEPBS16_SPO1_SSO2_WSGRB1_WG16_16_1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWB4_GSUM_LBSPPA512_LBSPPB256_MIWT1_1_NTA1_NEPBS16_NLCA1_SPO1_SSO0_SVW1_VWA1_WG64_4_1 + LSCA: 64 LSCB: 128 LSPA: 4 - LSPB: 4 - LVCA: 2 - LVCB: 16 + LSPB: 2 + LVCA: 8 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 27648 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8704 + LdsNumElements: 27200 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 25088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 25088 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -11993,15 +12047,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] + MIWaveGroup: [4, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -12017,8 +12071,8 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalA: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 @@ -12026,12 +12080,12 @@ NumElementsPerBatchStore: 16 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12058,6 +12112,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -12117,19 +12172,19 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GSU8_LBSPPB256_LWPMn1_MIWT1_1_NTB1_NEPBS16_SPO1_SSO2_WSGRB1_WG16_16_1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWB4_GSU7_LBSPPA512_LBSPPB256_MIWT1_1_NTA1_NEPBS16_NLCA1_SPO1_SSO0_SVW1_VWA1_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 + StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 @@ -12152,10 +12207,10 @@ WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [32, 0] + WorkspaceCheck: [28, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 @@ -12164,14 +12219,14 @@ _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 32 + _WorkspaceSizePerElemC: 28 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -12181,7 +12236,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -12192,10 +12247,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12204,39 +12259,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x16x64_MI16x16x1_SN_LDSB0_GSU1_LBSPPB128_LWPMn1_MIWT1_1_NTB0_NEPBS0_SPO0_SSO0_WSGRB0_WG16_4_1 - LSCA: 16 - LSCB: 64 - LSPA: 1 - LSPB: 8 - LVCA: 2 - LVCB: 8 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT128x16x128_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA1024_LBSPPB256_MIWT2_1_NTA1_NEPBS0_NLCA1_SPO0_SSO2_SVW2_VWA2_WG64_4_1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 6528 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 19072 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19072 + LdsOffsetMetadata_Blk: 49664 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -12246,14 +12304,14 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 @@ -12270,22 +12328,22 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 1 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -12311,6 +12369,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -12370,24 +12429,24 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x16x64_MI16x16x1_SN_LDSB0_GSU1_LBSPPB128_LWPMn1_MIWT1_1_NTB0_NEPBS0_SPO0_SSO0_WSGRB0_WG16_4_1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT128x16x128_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA1024_LBSPPB256_MIWT2_1_NTA1_NEPBS0_NLCA1_SPO0_SSO2_SVW2_VWA2_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 + StoreSyncOpt: 2 + StoreVectorWidth: 2 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -12399,32 +12458,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 1 - - 1LDSBuffer: 1 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -12445,8 +12504,8 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 9 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -12457,35 +12516,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSUM_LBSPPB256_LWPM0p40_MIWT1_4_NTB1_NEPBS16_SPO1_SSO2_WSGRB1_WG16_8_1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWB4_GSUM_LBSPPA512_LBSPPB256_MIWT1_1_NTA0_NEPBS16_NLCA1_SPO1_SSO2_SVW1_VWA1_WG64_4_1 + LSCA: 64 LSCB: 128 - LSPA: 2 - LSPB: 4 - LVCA: 2 - LVCB: 16 + LSPA: 4 + LSPB: 2 + LVCA: 8 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 19968 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 17408 + LdsNumElements: 27200 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 - LdsOffsetMetadata: 19968 - LdsOffsetMetadata_Blk: 35328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 25088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 25088 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 @@ -12499,15 +12561,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 4] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 128 - MacroTileA: 16 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -12524,21 +12586,21 @@ NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -12564,6 +12626,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -12623,25 +12686,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 52 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSU9_LBSPPB256_LWPM0p40_MIWT1_4_NTB1_NEPBS16_SPO1_SSO2_WSGRB1_WG16_8_1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWB4_GSU1_LBSPPA512_LBSPPB256_MIWT1_1_NTA0_NEPBS16_NLCA1_SPO1_SSO2_SVW1_VWA1_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSyncOpt: 2 StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 1 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -12658,26 +12721,26 @@ WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [36, 0] + WorkspaceCheck: [0, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 36 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -12687,7 +12750,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -12698,10 +12761,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 7 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12710,39 +12773,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GRVWB4_GSUM_LBSPPA512_LPB4_LRVW4_LWPM0p45_MIWT1_1_NTA0_NEPBS0_SPO0_SSO0_SVW1_VWA1_WG64_4_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x16x32_MI16x16x1_SN_LDSB1_GRVWB2_GSUM_LBSPPA2048_LBSPPB128_MIWT4_1_NTA1_NEPBS16_NLCA1_SPO0_SSO2_SVW4_VWA4_WG64_4_1 + LSCA: 256 + LSCB: 32 LSPA: 4 LSPB: 4 - LVCA: 8 + LVCA: 32 LVCB: 16 LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 + LVPB: 2 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 13632 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 8960 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 12544 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 12544 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 24704 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8960 + LdsOffsetMetadata_Blk: 24704 LdsPadA: 16 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.45 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -12753,13 +12819,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [4, 1] + MIWaveTileA: 4 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 256 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 256 MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 @@ -12774,22 +12840,22 @@ MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: false + NoTailLoop: true NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 1 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -12817,6 +12883,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -12876,24 +12943,24 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 53 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GRVWB4_GSU7_LBSPPA512_LPB4_LRVW4_LWPM0p45_MIWT1_1_NTA0_NEPBS0_SPO0_SSO0_SVW1_VWA1_WG64_4_1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x16x32_MI16x16x1_SN_LDSB1_GRVWB2_GSU7_LBSPPA2048_LBSPPB128_MIWT4_1_NTA1_NEPBS16_NLCA1_SPO0_SSO2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 2 + StoreVectorWidth: 4 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -12905,7 +12972,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 @@ -12915,22 +12982,22 @@ WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [28, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 28 - _staggerStrideShift: 0 - - 1LDSBuffer: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -12951,10 +13018,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12963,42 +13030,45 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GRVWB4_GSU1_LBSPPA512_LPB4_LRVW4_LWPMn1_MIWT1_1_NTA0_NEPBS0_SPO0_SSO0_SVW1_VWA1_WG64_4_1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA0_LPA0_LPB8_LRVW8_LWPMn1_MIWT2_4_SUM2_SVW2_VWA2_WG128_2_1 + LSCA: 256 LSCB: 64 - LSPA: 4 + LSPA: 2 LSPB: 4 - LVCA: 8 - LVCB: 16 + LVCA: 32 + LVCB: 8 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 13632 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 25600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 12544 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 12544 - LdsPadA: 16 - LdsPadB: 4 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 8 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 16, 1, 1, 1] + MIBlock: [32, 32, 8, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -13006,22 +13076,22 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -13032,18 +13102,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13070,6 +13140,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -13129,25 +13200,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 54 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GRVWB4_GSU1_LBSPPA512_LPB4_LRVW4_LWPMn1_MIWT1_1_NTA0_NEPBS0_SPO0_SSO0_SVW1_VWA1_WG64_4_1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA0_LPA0_LPB8_LRVW8_LWPMn1_MIWT2_4_SUM2_SVW2_VWA2_WG128_2_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreVectorWidth: 2 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -13158,13 +13229,13 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] @@ -13177,13 +13248,13 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 0 + _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -13216,35 +13287,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPM0p45_MIWT2_4_NTD1_SUS256_SPO0_SSO2_SVW2_VWA2_WSGRB0_WG128_2_1_WGM1 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA0_LPA0_LPB8_LRVW8_LWPMn1_MIWT2_2_SUM2_SVW2_VWA2_WG64_4_1 + LSCA: 128 LSCB: 64 LSPA: 4 - LSPB: 32 - LVCA: 32 + LSPB: 4 + LVCA: 16 LVCB: 8 LVPA: 1 - LVPB: 4 + LVPB: 1 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25600 - LdsNumElementsAlignedA: 16384 + LdsNumElements: 17408 + LdsNumElementsAlignedA: 8192 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 - LocalWritePerMfma: 0.45 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 @@ -13258,14 +13332,14 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 @@ -13285,17 +13359,17 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -13323,6 +13397,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -13382,25 +13457,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 55 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPM0p45_MIWT2_4_NTD1_SUS256_SPO0_SSO2_SVW2_VWA2_WSGRB0_WG128_2_1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA0_LPA0_LPB8_LRVW8_LWPMn1_MIWT2_2_SUM2_SVW2_VWA2_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 + StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -13413,11 +13488,11 @@ VectorStore: -1 VectorWidthA: 2 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] @@ -13436,7 +13511,7 @@ ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -13446,7 +13521,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -13457,10 +13532,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 2 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13469,42 +13544,45 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_LWPM0p40_MIWT2_4_NTD1_SUS128_SPO0_SSO2_SVW2_VWA2_WSGRB1_WG128_2_1_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM2_SVW4_VWA4_WG64_4_1 LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 16 + LSCB: 64 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 4 + LVCB: 16 LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 0 + LVPB: 1 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 12800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4608 + LdsNumElements: 24320 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24320 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 4 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -13512,44 +13590,44 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 112 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 112 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: true + NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13576,6 +13654,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 11 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -13635,25 +13714,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_LWPM0p40_MIWT2_4_NTD1_SUS128_SPO0_SSO2_SVW2_VWA2_WSGRB1_WG128_2_1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU2_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -13664,25 +13743,25 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: null + WorkspaceCheck: [8, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 + _WorkspaceSizePerElemC: 8 _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false @@ -13710,10 +13789,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13722,65 +13801,68 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_GRVWB8_GSU1_LBSPPA0_LPA0_LPB8_LWPMn1_MIWT2_2_NTD1_NEPBS0_SU4_SUS256_SPO0_SSO0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM1_SVW4_VWA4_WG64_4_1 + LSCA: 256 LSCB: 64 - LSPA: 4 + LSPA: 2 LSPB: 4 - LVCA: 16 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 17408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 24320 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24320 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 8, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 112 + MacroTileA: 256 + MacroTileB: 112 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -13791,18 +13873,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13889,25 +13971,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_GRVWB8_GSU1_LBSPPA0_LPA0_LPB8_LWPMn1_MIWT2_2_NTD1_NEPBS0_SU4_SUS256_SPO0_SSO0 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM1_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 1 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -13918,9 +14000,9 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadA: 1 WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 @@ -13943,7 +14025,7 @@ ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -13964,10 +14046,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13976,42 +14058,45 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSUM_LWPM0p40_MIAV0_MIWT2_4_NEPBS0_SUS256_SPO1_SSO0_WSGRB1_WGM32 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_14_SUM1_SVW4_VWA4_WG64_4_1 LSCA: 256 LSCB: 64 - LSPA: 4 - LSPB: 8 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 31872 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31872 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: 0.4 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -14019,22 +14104,22 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 224 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -14045,18 +14130,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14143,25 +14228,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU2_LWPM0p40_MIAV0_MIWT2_4_NEPBS0_SUS256_SPO1_SSO0_WSGRB1_WGM32 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_14_SUM1_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 1 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -14172,32 +14257,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [8, 0] + WorkspaceCheck: [0, 0] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 8 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -14218,10 +14303,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14230,42 +14315,45 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPM0p40_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO2_WSGRB0_WGM32 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_14_SUM2_SVW4_VWA4_WG64_4_1 LSCA: 256 LSCB: 64 - LSPA: 4 - LSPB: 32 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 0 + LVPB: 1 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 31872 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31872 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -14273,22 +14361,22 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 224 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -14299,18 +14387,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14397,25 +14485,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPM0p40_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO2_WSGRB0_WGM32 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_14_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -14426,14 +14514,14 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] _DepthU: 64 @@ -14445,13 +14533,13 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 2 + _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -14461,7 +14549,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -14471,8 +14559,8 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 4 @@ -14484,39 +14572,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x240x32_MI16x16x1_SN_GRVWA2_GSU1_LWPMn1_MIWT4_15_NEPBS0_NLCA2_SUS256_SPO0_SSO0_WSGRB1_WGM32 - LSCA: 128 - LSCB: 32 - LSPA: 4 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM2_SVW4_VWA4_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 2 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 4096 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 17920 - LdsNumElementsAlignedA: 8320 - LdsNumElementsAlignedB: 9600 + LdsNumElements: 24320 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8320 - LdsOffsetB_Blk: 41088 - LdsOffsetMetadata: 17920 - LdsOffsetMetadata_Blk: 41088 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24320 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -14527,14 +14618,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 15] + MIWaveTile: [4, 7] MIWaveTileA: 4 - MIWaveTileB: 15 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 240 + MacroTile1: 112 MacroTileA: 256 - MacroTileB: 240 + MacroTileB: 112 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -14548,23 +14639,23 @@ MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: true + NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 16 - NumLoadsB: 15 - NumLoadsCoalescedA: 2 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 15 + NumLoadsPerpendicularB: 7 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14651,10 +14742,10 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 60 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x240x32_MI16x16x1_SN_GRVWA2_GSU1_LWPMn1_MIWT4_15_NEPBS0_NLCA2_SUS256_SPO0_SSO0_WSGRB1_WGM32 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 @@ -14667,9 +14758,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 15 + ThreadTile1: 7 ThreadTileA: 16 - ThreadTileB: 15 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -14682,30 +14773,30 @@ VectorStore: -1 VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 2 + _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -14726,10 +14817,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14738,42 +14829,45 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPMn1_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO0_WSGRB1_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_14_SUM2_SVW4_VWA4_WG64_4_1 LSCA: 256 LSCB: 64 - LSPA: 4 - LSPB: 8 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 31872 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31872 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: -1 + LocalWritePerMfma: 0.4 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -14781,22 +14875,22 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 224 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -14807,18 +14901,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14905,25 +14999,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 61 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPMn1_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO0_WSGRB1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_14_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -14934,13 +15028,13 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] @@ -14953,13 +15047,13 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 2 + _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -14969,7 +15063,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -14992,39 +15086,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO2_WSGRB0_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x128x32_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_8_SUM2_SVW4_VWA4_WG64_4_1 LSCA: 256 - LSCB: 64 - LSPA: 4 - LSPB: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 4096 + LVPB: 1 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 28032 - LdsNumElementsAlignedA: 16512 - LdsNumElementsAlignedB: 11520 + LdsNumElements: 12672 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 4352 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16512 - LdsOffsetB_Blk: 49280 - LdsOffsetMetadata: 28032 - LdsOffsetMetadata_Blk: 49280 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 24704 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 12672 + LdsOffsetMetadata_Blk: 24704 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -15035,14 +15132,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 9] + MIWaveTile: [4, 8] MIWaveTileA: 4 - MIWaveTileB: 9 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 144 + MacroTile1: 128 MacroTileA: 256 - MacroTileB: 144 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -15056,23 +15153,23 @@ MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: false + NoTailLoop: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 8 - NumLoadsB: 9 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 9 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15159,14 +15256,14 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO2_WSGRB0_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x128x32_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_8_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 + StoreSyncOpt: 0 StoreVectorWidth: 4 SubGroup0: 16 SubGroup1: 16 @@ -15175,9 +15272,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 9 + ThreadTile1: 8 ThreadTileA: 16 - ThreadTileB: 9 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -15190,30 +15287,30 @@ VectorStore: -1 VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 1 - - 1LDSBuffer: 1 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -15223,7 +15320,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -15234,7 +15331,7 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 4 @@ -15246,39 +15343,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO0_WSGRB1_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x224x32_MI16x16x1_SN_LDSB0_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_14_SUM2_SVW4_VWA4_WG64_4_1 LSCA: 256 - LSCB: 64 - LSPA: 4 - LSPB: 2 + LSCB: 32 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 32 + LVCB: 8 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 28032 - LdsNumElementsAlignedA: 16512 - LdsNumElementsAlignedB: 11520 + LdsNumElements: 32320 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16512 - LdsOffsetB_Blk: 49280 - LdsOffsetMetadata: 28032 - LdsOffsetMetadata_Blk: 49280 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 24704 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8320 + LdsOffsetMetadata_Blk: 24704 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: -1 + LocalWritePerMfma: 0.4 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -15289,14 +15389,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 9] + MIWaveTile: [4, 14] MIWaveTileA: 4 - MIWaveTileB: 9 + MIWaveTileB: 14 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 144 + MacroTile1: 224 MacroTileA: 256 - MacroTileB: 144 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -15310,23 +15410,23 @@ MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: false + NoTailLoop: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 8 - NumLoadsB: 18 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 4 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 18 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 7 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15413,10 +15513,10 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO0_WSGRB1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HF8_HHS_BH_MT256x224x32_MI16x16x1_SN_LDSB0_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_14_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 @@ -15429,9 +15529,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 9 + ThreadTile1: 14 ThreadTileA: 16 - ThreadTileB: 9 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -15444,24 +15544,24 @@ VectorStore: -1 VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 1 + _staggerStrideShift: 2 - [2, 3, 0, 1] - - - [1104, 1, 1, 4608] - [0, 675.057] @@ -15544,55 +15644,55 @@ - - [9216, 1408, 1, 768] - [36, 368290.0] - - [16, 16, 1, 1024] - - [41, 0.0] + - [37, 0.0] - - [16, 16, 1, 8192] - - [47, 0.0] + - [41, 0.0] - - [16, 16, 1, 65536] - - [48, 0.0] - - - [16, 2048, 1, 1024] - [42, 0.0] + - - [16, 2048, 1, 1024] + - [38, 0.0] - - [16, 2048, 1, 8192] - - [49, 0.0] + - [43, 0.0] - - [16, 2048, 1, 65536] - - [50, 0.0] + - [44, 0.0] - - [16, 8192, 1, 1024] - - [43, 0.0] + - [45, 0.0] - - [16, 8192, 1, 8192] - - [51, 0.0] + - [46, 0.0] - - [16, 8192, 1, 65536] - - [52, 0.0] + - [47, 0.0] - - [2048, 16, 1, 1024] - - [39, 0.0] + - [48, 0.0] - - [2048, 16, 1, 8192] - - [53, 0.0] + - [49, 0.0] - - [2048, 16, 1, 65536] - - [40, 0.0] + - [50, 0.0] - - [2048, 2048, 1, 1024] - - [57, 0.0] + - [54, 0.0] - - [2048, 2048, 1, 8192] - - [44, 0.0] + - [55, 0.0] - - [2048, 2048, 1, 65536] - - [58, 0.0] + - [56, 0.0] - - [2048, 8192, 1, 1024] - - [45, 0.0] + - [57, 0.0] - - [2048, 8192, 1, 8192] - - [59, 0.0] + - [58, 0.0] - - [2048, 8192, 1, 65536] - - [60, 0.0] + - [59, 0.0] - - [8192, 16, 1, 1024] - - [37, 0.0] + - [51, 0.0] - - [8192, 16, 1, 8192] - - [54, 0.0] + - [52, 0.0] - - [8192, 16, 1, 65536] - - [38, 0.0] + - [53, 0.0] - - [8192, 2048, 1, 1024] - - [61, 0.0] + - [40, 0.0] - - [8192, 2048, 1, 8192] - - [55, 0.0] + - [60, 0.0] - - [8192, 2048, 1, 65536] - - [46, 0.0] + - [39, 0.0] - - [8192, 8192, 1, 1024] - - [56, 0.0] + - [61, 0.0] - - [8192, 8192, 1, 8192] - [62, 0.0] - - [8192, 8192, 1, 65536] diff --git a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml index cee421c358..7987e5b912 100644 --- a/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml +++ b/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942/Equality/aquavanjaram_Cijk_Ailk_Bljk_HHS_BH.yaml @@ -8583,7 +8583,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -8594,10 +8594,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 19 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8605,22 +8605,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x128_MI16x16x1_SN_GSU1_MIWT1_1_NTA2_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM16 - LSCA: 64 - LSCB: 128 - LSPA: 32 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x16x64_MI16x16x1_SN_GSUM_MIWT2_1_NTA1_NTB0_SU4_SUS256_WSGRA1_WSGRB1_WGM1 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 1 LVPB: 1 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 27136 + LdsNumElements: 25920 LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedB: 1152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 @@ -8629,15 +8629,15 @@ LdsOffsetMetadata: 8448 LdsOffsetMetadata_Blk: 24832 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: false MIBlock: [16, 16, 16, 1, 1, 1] @@ -8648,13 +8648,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 @@ -8671,14 +8671,14 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 + NonTemporalA: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 1 @@ -8693,7 +8693,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 3 PreloadKernArgs: false ProblemType: Activation: false @@ -8768,24 +8768,24 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x128_MI16x16x1_SN_GSU1_MIWT1_1_NTA2_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM16 - SourceSwap: 0 - StaggerU: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x16x64_MI16x16x1_SN_GSU19_MIWT2_1_NTA1_NTB0_SU4_SUS256_WSGRA1_WSGRB1_WGM1 + SourceSwap: 1 + StaggerU: 4 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 - StoreRemapVectorWidth: 4 + StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -8797,27 +8797,27 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadA: 1 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: null + WorkspaceCheck: [76, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 0 - - 1LDSBuffer: 1 + _WorkspaceSizePerElemC: 76 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true @@ -8832,7 +8832,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 16 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -8843,8 +8843,8 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 14 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 8 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -8854,88 +8854,88 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x32x16_MI32x32x1_SN_GSUM_MIWT1_1_NTA0_NTB1_SU0_SUS0_WSGRA0_WSGRB0_WGM56 - LSCA: 128 - LSCB: 16 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSUM_MIWT1_1_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB1_WGM110 + LSCA: 32 + LSCB: 128 LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPB: 4 + LVCA: 4 + LVCB: 16 LVPA: 2 - LVPB: 16 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 8448 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 14848 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 2304 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsOffsetMetadata: 2688 - LdsOffsetMetadata_Blk: 6144 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 12544 + LdsOffsetMetadata: 4352 + LdsOffsetMetadata_Blk: 12544 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 16 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false - NonTemporal: -1 + NonTemporal: 0 NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -9017,24 +9017,24 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x32x16_MI32x32x1_SN_GSU14_MIWT1_1_NTA0_NTB1_SU0_SUS0_WSGRA0_WSGRB0_WGM56 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSU8_MIWT1_1_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB1_WGM110 SourceSwap: 0 StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 StorePriorityOpt: 1 - StoreRemapVectorWidth: 4 + StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -9048,31 +9048,31 @@ VectorStore: -1 VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 56 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 110 WorkGroupReduction: false - WorkspaceCheck: [56, 0] - _DepthU: 16 - _DepthUA: 16 - _DepthUB: 16 - _DepthUMetadata: 16 + WorkspaceCheck: [32, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 56 + _WorkspaceSizePerElemC: 32 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -9081,7 +9081,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9092,52 +9092,53 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 8 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GSUM_MIWT1_1_NTA0_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM1 - LSCA: 64 - LSCB: 64 - LSPA: 32 - LSPB: 2 - LVCA: 8 - LVCB: 32 - LVPA: 4 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSUM_LBSPPB256_LWPM0p40_MIWT1_4_NTB0_NEPBS0_SPO1_SSO2_WSGRB1_WG16_8_1 + LSCA: 16 + LSCB: 128 + LSPA: 2 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 13632 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 19968 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 12544 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 12544 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 35328 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 35328 LdsPadA: 16 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: -1 + LocalWritePerMfma: 0.4 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false + MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -9145,15 +9146,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] + MIWaveGroup: [1, 2] + MIWaveTile: [1, 4] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -9170,32 +9171,33 @@ NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 16 + NumThreads: 128 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 + PrefetchLocalRead: 1 PreloadKernArgs: false ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -9207,6 +9209,8 @@ ComplexConjugateB: false ComputeDataType: 0 DataType: 4 + DataTypeA: 4 + DataTypeB: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -9261,30 +9265,30 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GSU4_MIWT1_1_NTA0_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSU8_LBSPPB256_LWPM0p40_MIWT1_4_NTB0_NEPBS0_SPO1_SSO2_WSGRB1_WG16_8_1 SourceSwap: 1 - StaggerU: 0 + StaggerU: 4 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 + StoreSyncOpt: 2 StoreVectorWidth: 1 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -9297,31 +9301,31 @@ VectorStore: -1 VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [16, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: null + WorkspaceCheck: [32, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 16 + _WorkspaceSizePerElemC: 32 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -9330,7 +9334,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9341,52 +9345,56 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 19 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x16x64_MI16x16x1_SN_GSUM_MIWT2_1_NTA1_NTB0_SU4_SUS256_WSGRA1_WSGRB1_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSUM_LBSPPA2048_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 LSCA: 128 - LSCB: 64 + LSCB: 128 LSPA: 4 LSPB: 4 LVCA: 16 LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25920 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 25856 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25856 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 4 + LdsPadB: 16 LdsPadMetadata: 0 - LocalReadVectorWidth: 4 + LocalReadVectorWidth: 8 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false + MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -9395,14 +9403,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -9418,21 +9426,21 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9440,11 +9448,12 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 + PrefetchLocalRead: 1 PreloadKernArgs: false ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -9456,6 +9465,9 @@ ComplexConjugateB: false ComputeDataType: 0 DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -9510,17 +9522,17 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x16x64_MI16x16x1_SN_GSU19_MIWT2_1_NTA1_NTB0_SU4_SUS256_WSGRA1_WSGRB1_WGM1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA2048_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 @@ -9531,9 +9543,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 1 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -9547,30 +9559,30 @@ VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [76, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [0, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 76 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -9579,7 +9591,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9591,41 +9603,45 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSUM_MIWT1_1_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB1_WGM110 - LSCA: 32 - LSCB: 128 - LSPA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWA8_GRVWB8_GSUM_LBSPPA0_LBSPPB128_LPA0_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 4 - LVCB: 16 - LVPA: 2 + LVCA: 16 + LVCB: 8 + LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 14848 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 2304 + LdsNumElements: 17408 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 12544 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 12544 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 @@ -9633,40 +9649,40 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 - LoopUnroll: 128 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [16, 16, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 8, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] + MatrixInstK: 8 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 8, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false NoTailLoop: false - NonTemporal: 0 + NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -9674,15 +9690,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -9694,6 +9710,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -9705,6 +9722,9 @@ ComplexConjugateB: false ComputeDataType: 0 DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -9759,30 +9779,30 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSU8_MIWT1_1_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB1_WGM110 - SourceSwap: 0 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA0_LBSPPB128_LPA0_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -9793,33 +9813,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 110 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [32, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 32 - _staggerStrideShift: 0 - - 1LDSBuffer: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -9828,7 +9848,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -9839,52 +9859,56 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSUM_MIWT1_1_NTA0_NTB0_SU0_SUS0_WSGRA1_WSGRB1_WGM1 - LSCA: 16 - LSCB: 128 - LSPA: 32 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB4_GSUM_LBSPPA2048_LBSPPB128_LPA16_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 2 LSPB: 4 - LVCA: 2 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 27648 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8704 + LdsNumElements: 31872 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31872 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: false + MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -9892,15 +9916,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -9923,14 +9947,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 8 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9943,6 +9967,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -9954,6 +9979,9 @@ ComplexConjugateB: false ComputeDataType: 0 DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -10008,30 +10036,30 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSU2_MIWT1_1_NTA0_NTB0_SU0_SUS0_WSGRA1_WSGRB1_WGM1 - SourceSwap: 0 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB4_GSU1_LBSPPA2048_LBSPPB128_LPA16_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -10042,33 +10070,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [8, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [0, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 8 - _staggerStrideShift: 0 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -10087,45 +10115,49 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: 14 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSU1_MIWT1_1_NTA1_NTB2_SU0_SUS0_WSGRA0_WSGRB1_WGM1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSUM_LBSPPA256_LBSPPB256_MIWT1_1_NEPBS16_SPO1_SSO2_WG32_4_1 + LSCA: 32 LSCB: 128 - LSPA: 64 + LSPA: 2 LSPB: 4 LVCA: 4 LVCB: 16 - LVPA: 16 + LVPA: 1 LVPB: 1 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 27904 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 14976 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 18688 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 18688 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 12800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 12800 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -10133,7 +10165,7 @@ LoopIters: 8 LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false + MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -10141,15 +10173,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -10165,22 +10197,22 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -10192,6 +10224,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -10203,6 +10236,9 @@ ComplexConjugateB: false ComputeDataType: 0 DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -10257,24 +10293,24 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_GSU1_MIWT1_1_NTA1_NTB2_SU0_SUS0_WSGRA0_WSGRB1_WGM1 - SourceSwap: 0 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x16x128_MI16x16x1_SN_GSU14_LBSPPA256_LBSPPB256_MIWT1_1_NEPBS16_SPO1_SSO2_WG32_4_1 + SourceSwap: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreSyncOpt: 2 + StoreVectorWidth: 1 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 @@ -10293,31 +10329,31 @@ VectorStore: -1 VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] + WorkspaceCheck: [56, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: null + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 + _WorkspaceSizePerElemC: 56 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 + AssertFree1ElementMultiple: 8 + AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true BufferLoad: true @@ -10326,7 +10362,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -10338,52 +10374,56 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: 60 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true ISA: [9, 4, 2] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_GSU1_MIWT2_2_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB0_WGM4 - LSCA: 128 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x32x128_MI16x16x1_SN_GSUM_LBSPPA256_LBSPPB256_MIWT1_1_NEPBS0_SPO0_SSO0_WG32_8_1 + LSCA: 32 + LSCB: 128 LSPA: 4 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPB: 4 + LVCA: 4 + LVCB: 16 LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 17408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 25344 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4352 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 - LoopUnroll: 64 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -10391,520 +10431,22 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] - MaxOccupancy: 40 - MaxVgprNumber: 256 - MinVgprNumber: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: 0 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 - PreloadKernArgs: false - ProblemType: - Activation: false - ActivationComputeDataType: 0 - ActivationType: none - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Fp16AltImpl: false - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StridedBatched: true - SupportUserArgs: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: false - UseE: false - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: false - UseScaleAlphaVec: false - UseScaleDVec: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_GSU1_MIWT2_2_NT0_NTA0_NTB0_NTC0_NTD0_NTM0_SU0_SUS0_WSGRA1_WSGRB0_WGM4 - SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupReduction: false - WorkspaceCheck: [0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: null - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 0 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: V3 - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_2_NTA1_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM24 - LSCA: 256 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 4 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumElements: 12800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] - MaxOccupancy: 40 - MaxVgprNumber: 256 - MinVgprNumber: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: false - ProblemType: - Activation: false - ActivationComputeDataType: 0 - ActivationType: none - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Fp16AltImpl: false - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StridedBatched: true - SupportUserArgs: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: false - UseE: false - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: false - UseScaleAlphaVec: false - UseScaleDVec: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_2_NTA1_NTB1_SU0_SUS0_WSGRA0_WSGRB1_WGM24 - SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 1 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 24 - WorkGroupReduction: false - WorkspaceCheck: [0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: null - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 0 - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: V3 - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - ISA: [9, 4, 2] - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_4_NTA0_NTB1_SU0_SUS0_WSGRA1_WSGRB1_WGM1 - LSCA: 256 - LSCB: 32 - LSPA: 2 - LSPB: 8 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumElements: 12800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4608 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: false - MIBlock: [32, 32, 8, 1, 1, 1] - MIInputPerThread: 4 - MIInputPerThreadA: 4 - MIInputPerThreadB: 4 - MIInputPerThreadMetadata: 4 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -10913,21 +10455,21 @@ NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -10939,6 +10481,7 @@ ProblemType: Activation: false ActivationComputeDataType: 0 + ActivationNoGuard: false ActivationType: none AllowNoFreeDims: false AssignedDerivedParameters: true @@ -10950,6 +10493,9 @@ ComplexConjugateB: false ComputeDataType: 0 DataType: 4 + DataTypeA: 4 + DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -11004,30 +10550,30 @@ UseInitialStridesCD: false UseScaleAB: false UseScaleAlphaVec: false - UseScaleDVec: false + UseScaleCD: false ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_MIWT4_4_NTA0_NTB1_SU0_SUS0_WSGRA1_WSGRB1_WGM1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT32x32x128_MI16x16x1_SN_GSU60_LBSPPA256_LBSPPB256_MIWT1_1_NEPBS0_SPO0_SSO0_WG32_8_1 SourceSwap: 1 - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerU: 4 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 4 - SubGroup0: 4 + StoreVectorWidth: 1 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -11038,32 +10584,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 1 - WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 2, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: null + WorkspaceCheck: [240, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 + _WorkspaceSizePerElemC: 240 _staggerStrideShift: 0 - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -11084,8 +10630,8 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 24 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -11096,35 +10642,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x32x128_MI16x16x1_SN_LDSB0_GSUM_LBSPPB256_LWPM0p40_MIWT1_1_NTB1_NEPBS0_SPO0_SSO0_WSGRB1_WG16_8_1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWB4_GSUM_LBSPPA512_LBSPPB256_MIWT1_1_NTA1_NEPBS0_NLCA1_SPO1_SSO0_SVW1_VWA1_WG64_4_1 + LSCA: 64 LSCB: 128 - LSPA: 2 - LSPB: 4 - LVCA: 2 - LVCB: 16 + LSPA: 4 + LSPB: 2 + LVCA: 8 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 15104 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4352 + LdsNumElements: 27200 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 25088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 25088 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 @@ -11138,15 +10687,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [4, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -11162,8 +10711,8 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalA: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 @@ -11171,13 +10720,13 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11203,6 +10752,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -11261,20 +10811,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x32x128_MI16x16x1_SN_LDSB0_GSU24_LBSPPB256_LWPM0p40_MIWT1_1_NTB1_NEPBS0_SPO0_SSO0_WSGRB1_WG16_8_1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB0_GRVWB4_GSU1_LBSPPA512_LBSPPB256_MIWT1_1_NTA1_NEPBS0_NLCA1_SPO1_SSO0_SVW1_VWA1_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 @@ -11297,26 +10847,26 @@ WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [96, 0] + WorkspaceCheck: [0, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 96 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -11326,7 +10876,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 128 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -11337,10 +10887,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 62 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 8 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11349,39 +10899,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x16x128_MI16x16x1_SN_LDSB0_GSUM_LBSPPB256_LWPM0p40_MIWT1_1_NTB0_NEPBS16_SPO0_SSO2_WSGRB1_WG16_4_1 - LSCA: 16 - LSCB: 128 - LSPA: 1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x16x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LBSPPB128_MIWT4_1_NTA1_NEPBS0_NLCA1_SPO0_SSO0_SVW4_VWA4_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 2 + LVCA: 32 LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 17792 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 1152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17792 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -11391,14 +10944,14 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 256 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 256 MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 @@ -11415,22 +10968,22 @@ NoReject: false NoTailLoop: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 1 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11456,6 +11009,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -11514,25 +11068,25 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x16x128_MI16x16x1_SN_LDSB0_GSU62_LBSPPB256_LWPM0p40_MIWT1_1_NTB0_NEPBS16_SPO0_SSO2_WSGRB1_WG16_4_1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x16x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU8_LBSPPA2048_LBSPPB128_MIWT4_1_NTA1_NEPBS0_NLCA1_SPO0_SSO0_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 1 - SubGroup0: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -11544,32 +11098,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [248, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + WorkspaceCheck: [32, 0] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 248 - _staggerStrideShift: 0 + _WorkspaceSizePerElemC: 32 + _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -11590,10 +11144,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11602,35 +11156,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSUM_LBSPPB256_LWPM0p40_MIWT1_4_NTB0_NEPBS0_SPO1_SSO2_WSGRB1_WG16_8_1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x16x128_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA1024_LBSPPB256_MIWT2_1_NTA0_NEPBS0_NLCA1_SPO0_SSO0_SVW2_VWA2_WG64_4_1 + LSCA: 128 LSCB: 128 - LSPA: 2 - LSPB: 4 - LVCA: 2 - LVCB: 16 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 19968 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 17408 + LdsNumElements: 19072 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 - LdsOffsetMetadata: 19968 - LdsOffsetMetadata_Blk: 35328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 49664 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19072 + LdsOffsetMetadata_Blk: 49664 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 @@ -11644,15 +11201,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 4] - MIWaveTileA: 1 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 128 - MacroTileA: 16 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -11675,15 +11232,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11709,6 +11266,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -11767,26 +11325,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSU8_LBSPPB256_LWPM0p40_MIWT1_4_NTB0_NEPBS0_SPO1_SSO2_WSGRB1_WG16_8_1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x16x128_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA1024_LBSPPB256_MIWT2_1_NTA0_NEPBS0_NLCA1_SPO0_SSO0_SVW2_VWA2_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -11797,32 +11355,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [32, 0] + WorkspaceCheck: [0, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 32 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -11843,10 +11401,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11855,31 +11413,34 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GSUM_LBSPPB256_LWPMn1_MIWT1_1_NTB1_NEPBS16_SPO1_SSO2_WSGRB1_WG16_16_1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA512_LBSPPB256_MIWT2_1_NTA0_NEPBS16_NLCA1_SPO1_SSO0_SVW2_VWA2_WG32_4_1 + LSCA: 64 LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 2 - LVCB: 16 + LSPA: 2 + LSPB: 2 + LVCA: 8 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 27648 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8704 + LdsNumElements: 10880 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 2176 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 25088 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 10880 + LdsOffsetMetadata_Blk: 25088 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -11897,15 +11458,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -11922,21 +11483,21 @@ NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -11962,6 +11523,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -12020,25 +11582,25 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GSU8_LBSPPB256_LWPMn1_MIWT1_1_NTB1_NEPBS16_SPO1_SSO2_WSGRB1_WG16_16_1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x128_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA512_LBSPPB256_MIWT2_1_NTA0_NEPBS16_NLCA1_SPO1_SSO0_SVW2_VWA2_WG32_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -12050,32 +11612,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [32, 0] + WorkspaceCheck: [0, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 32 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -12096,10 +11658,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 5 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12108,31 +11670,34 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x16x64_MI16x16x1_SN_LDSB0_GSU1_LBSPPB128_LWPMn1_MIWT1_1_NTB0_NEPBS0_SPO0_SSO0_WSGRB0_WG16_4_1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x16x64_MI16x16x1_SN_LDSB1_GRVWB2_GSUM_LBSPPA2048_LBSPPB128_MIWT4_1_NTA0_NEPBS0_NLCA1_SPO0_SSO2_SVW4_VWA4_WG64_4_1 + LSCA: 256 LSCB: 64 - LSPA: 1 - LSPB: 8 - LVCA: 2 - LVCB: 8 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 32 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 6528 - LdsNumElementsAlignedA: 1280 + LdsNumElements: 17792 + LdsNumElementsAlignedA: 16640 LdsNumElementsAlignedB: 1152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17792 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -12150,14 +11715,14 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 1] + MIWaveTileA: 4 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 256 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 256 MacroTileB: 16 MagicDivAlg: 2 MatrixInstB: 1 @@ -12181,15 +11746,15 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -12215,6 +11780,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -12273,25 +11839,25 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x16x64_MI16x16x1_SN_LDSB0_GSU1_LBSPPB128_LWPMn1_MIWT1_1_NTB0_NEPBS0_SPO0_SSO0_WSGRB0_WG16_4_1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x16x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU5_LBSPPA2048_LBSPPB128_MIWT4_1_NTA0_NEPBS0_NLCA1_SPO0_SSO2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 + StoreSyncOpt: 2 + StoreVectorWidth: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -12303,32 +11869,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] + WorkspaceCheck: [20, 0] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: null + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 + _WorkspaceSizePerElemC: 20 _staggerStrideShift: 1 - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -12350,7 +11916,7 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 9 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -12361,10 +11927,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSUM_LBSPPB256_LWPM0p40_MIWT1_4_NTB1_NEPBS16_SPO1_SSO2_WSGRB1_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSUM_LBSPPB256_MIWT1_1_NTB1_NEPBS0_SPO0_SSO0_WG16_16_1 LSCA: 16 LSCB: 128 - LSPA: 2 + LSPA: 4 LSPB: 4 LVCA: 2 LVCB: 16 @@ -12374,22 +11940,25 @@ LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 19968 + LdsNumElements: 27648 LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 16384 LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 - LdsOffsetMetadata: 19968 - LdsOffsetMetadata_Blk: 35328 + LdsOffsetB_Blk: 18944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 18944 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 @@ -12403,15 +11972,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 4] + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 16 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 128 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -12433,16 +12002,16 @@ NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] @@ -12468,6 +12037,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -12526,26 +12096,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 52 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GSU9_LBSPPB256_LWPM0p40_MIWT1_4_NTB1_NEPBS16_SPO1_SSO2_WSGRB1_WG16_8_1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSU1_LBSPPB256_MIWT1_1_NTB1_NEPBS0_SPO0_SSO0_WG16_16_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 + StoreSyncOpt: 0 StoreVectorWidth: 1 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 1 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -12562,26 +12132,26 @@ WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [36, 0] + WorkspaceCheck: [0, 0] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 36 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -12591,7 +12161,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -12602,8 +12172,8 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 7 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 8 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 GroupLoadStore: false @@ -12614,39 +12184,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GRVWB4_GSUM_LBSPPA512_LPB4_LRVW4_LWPM0p45_MIWT1_1_NTA0_NEPBS0_SPO0_SSO0_SVW1_VWA1_WG64_4_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSUM_LBSPPB256_MIWT1_1_NTB1_NEPBS16_SPO1_SSO0_WG16_16_1 + LSCA: 16 + LSCB: 128 LSPA: 4 LSPB: 4 - LVCA: 8 + LVCA: 2 LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 13632 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 27648 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 12544 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 12544 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 18944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 18944 LdsPadA: 16 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.45 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -12656,15 +12229,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [1, 4] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -12681,20 +12254,20 @@ NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 1 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12721,6 +12294,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -12779,20 +12353,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 53 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GRVWB4_GSU7_LBSPPA512_LPB4_LRVW4_LWPM0p45_MIWT1_1_NTA0_NEPBS0_SPO0_SSO0_SVW1_VWA1_WG64_4_1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSU8_LBSPPB256_MIWT1_1_NTB1_NEPBS16_SPO1_SSO0_WG16_16_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 1 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 @@ -12815,26 +12389,26 @@ WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [28, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + WorkspaceCheck: [32, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 28 + _WorkspaceSizePerElemC: 32 _staggerStrideShift: 0 - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -12844,7 +12418,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -12855,7 +12429,7 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 1 @@ -12867,39 +12441,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GRVWB4_GSU1_LBSPPA512_LPB4_LRVW4_LWPMn1_MIWT1_1_NTA0_NEPBS0_SPO0_SSO0_SVW1_VWA1_WG64_4_1 - LSCA: 64 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GSUM_LBSPPB256_MIWT1_2_NTB1_NEPBS16_SPO0_SSO2_WG16_16_1 + LSCA: 16 + LSCB: 128 LSPA: 4 LSPB: 4 - LVCA: 8 + LVCA: 2 LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 13632 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 19968 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 12544 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 12544 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 35328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 35328 LdsPadA: 16 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -12909,15 +12486,15 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -12934,20 +12511,20 @@ NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 1 NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -12974,6 +12551,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -13032,26 +12610,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 54 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT64x16x64_MI16x16x1_SN_GRVWB4_GSU1_LBSPPA512_LPB4_LRVW4_LWPMn1_MIWT1_1_NTA0_NEPBS0_SPO0_SSO0_SVW1_VWA1_WG64_4_1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GRVWA8_GSU1_LBSPPB256_MIWT1_2_NTB1_NEPBS16_SPO0_SSO2_WG16_16_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUMapping: 2 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StoreSyncOpt: 2 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -13068,26 +12646,26 @@ WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 _staggerStrideShift: 0 - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -13097,7 +12675,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -13111,7 +12689,7 @@ GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13120,65 +12698,68 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPM0p45_MIWT2_4_NTD1_SUS256_SPO0_SSO2_SVW2_VWA2_WSGRB0_WG128_2_1_WGM1 - LSCA: 256 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSUM_LBSPPB256_MIWT1_1_NTB0_NEPBS16_SPO1_SSO0_WG16_16_1 + LSCA: 16 + LSCB: 128 LSPA: 4 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSPB: 4 + LVCA: 2 + LVCB: 16 LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 27648 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 18944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 18944 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.45 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 - LoopUnroll: 64 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -13193,13 +12774,13 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -13227,6 +12808,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -13285,26 +12867,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 55 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPM0p45_MIWT2_4_NTD1_SUS256_SPO0_SSO2_SVW2_VWA2_WSGRB0_WG128_2_1_WGM1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x64x128_MI16x16x1_SN_LDSB0_GRVWA8_GSU1_LBSPPB256_MIWT1_1_NTB0_NEPBS16_SPO1_SSO0_WG16_16_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -13315,32 +12897,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 1 + _staggerStrideShift: 0 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -13350,7 +12932,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 32 + DepthU: 128 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -13360,11 +12942,11 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13373,71 +12955,74 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_LWPM0p40_MIWT2_4_NTD1_SUS128_SPO0_SSO2_SVW2_VWA2_WSGRB1_WG128_2_1_WGM1 - LSCA: 256 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GRVWA4_GSUM_LBSPPB256_MIWT1_2_NTB0_NEPBS0_SPO1_SSO0_WG16_16_1 + LSCA: 16 + LSCB: 128 LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 4 + LSPB: 4 + LVCA: 4 + LVCB: 16 LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 12800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4608 + LdsNumElements: 19968 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 35328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 19968 + LdsOffsetMetadata_Blk: 35328 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 16 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 16 MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: true + NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -13445,15 +13030,15 @@ NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13480,6 +13065,7 @@ DataType: 4 DataTypeA: 4 DataTypeB: 4 + DataTypeE: 4 DestDataType: 4 F32XdlMathOp: 0 Fp16AltImpl: false @@ -13538,26 +13124,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI32x32x1_SN_GSU1_LWPM0p40_MIWT2_4_NTD1_SUS128_SPO0_SSO2_SVW2_VWA2_WSGRB1_WG128_2_1_WGM1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT16x128x128_MI16x16x1_SN_LDSB1_GRVWA4_GSU3_LBSPPB256_MIWT1_2_NTB0_NEPBS0_SPO1_SSO0_WG16_16_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUMapping: 2 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -13568,26 +13154,26 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 2 WaveSeparateGlobalReadB: 1 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: null + WorkspaceCheck: [12, 0] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 1 + _WorkspaceSizePerElemC: 12 + _staggerStrideShift: 0 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -13626,12 +13212,12 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_GRVWB8_GSU1_LBSPPA0_LPA0_LPB8_LWPMn1_MIWT2_2_NTD1_NEPBS0_SU4_SUS256_SPO0_SSO0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA0_LPA0_LPB8_LRVW8_LWPMn1_MIWT2_4_SUM1_SVW2_VWA2_WG128_2_1 + LSCA: 256 LSCB: 64 - LSPA: 4 + LSPA: 2 LSPB: 4 - LVCA: 16 + LVCA: 32 LVCB: 8 LVPA: 1 LVPB: 1 @@ -13639,16 +13225,19 @@ LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 17408 - LdsNumElementsAlignedA: 8192 + LdsNumElements: 25600 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -13660,7 +13249,7 @@ LoopIters: 8 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 8, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 @@ -13668,14 +13257,14 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 256 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 256 MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 @@ -13695,17 +13284,17 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -13792,26 +13381,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_GRVWB8_GSU1_LBSPPA0_LPA0_LPB8_LWPMn1_MIWT2_2_NTD1_NEPBS0_SU4_SUS256_SPO0_SSO0 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA0_LPA0_LPB8_LRVW8_LWPMn1_MIWT2_4_SUM1_SVW2_VWA2_WG128_2_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 1 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -13824,11 +13413,11 @@ VectorStore: -1 VectorWidthA: 2 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 + WaveSeparateGlobalReadA: 1 WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] @@ -13847,7 +13436,7 @@ ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -13869,7 +13458,7 @@ GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 GlobalReadVectorWidthB: 8 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 2 GroupLoadStore: false @@ -13880,12 +13469,12 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSUM_LWPM0p40_MIAV0_MIWT2_4_NEPBS0_SUS256_SPO1_SSO0_WSGRB1_WGM32 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSUM_LBSPPA0_LPA0_LPB8_LRVW8_LWPMn1_MIWT2_2_SUM2_SVW2_VWA2_WG64_4_1 + LSCA: 128 LSCB: 64 LSPA: 4 - LSPB: 8 - LVCA: 32 + LSPB: 4 + LVCA: 16 LVCB: 8 LVPA: 1 LVPB: 1 @@ -13893,22 +13482,25 @@ LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25600 - LdsNumElementsAlignedA: 16384 + LdsNumElements: 17408 + LdsNumElementsAlignedA: 8192 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 8 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 8 @@ -13922,14 +13514,14 @@ MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 256 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 @@ -13949,17 +13541,17 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -14046,26 +13638,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU2_LWPM0p40_MIAV0_MIWT2_4_NEPBS0_SUS256_SPO1_SSO0_WSGRB1_WGM32 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWB8_GSU1_LBSPPA0_LPA0_LPB8_LRVW8_LWPMn1_MIWT2_2_SUM2_SVW2_VWA2_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -14078,30 +13670,30 @@ VectorStore: -1 VectorWidthA: 2 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [8, 0] + WorkspaceCheck: [0, 0] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer + _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 8 + _WorkspaceSizePerElemC: 0 _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -14122,10 +13714,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 2 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14134,42 +13726,45 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPM0p40_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO2_WSGRB0_WGM32 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM1_SVW4_VWA4_WG64_4_1 LSCA: 256 LSCB: 64 - LSPA: 4 - LSPB: 32 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 0 + LVPB: 1 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 24320 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24320 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: 0.4 + LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -14177,22 +13772,22 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 112 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 112 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -14203,18 +13798,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 7 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14300,26 +13895,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPM0p40_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO2_WSGRB0_WGM32 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU2_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM1_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUMapping: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSyncOpt: 2 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -14330,32 +13925,32 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupReduction: false - WorkspaceCheck: [0, 0] + WorkspaceCheck: [8, 0] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: null + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 2 + _WorkspaceSizePerElemC: 8 + _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -14365,7 +13960,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -14375,8 +13970,8 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 4 @@ -14388,39 +13983,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x240x32_MI16x16x1_SN_GRVWA2_GSU1_LWPMn1_MIWT4_15_NEPBS0_NLCA2_SUS256_SPO0_SSO0_WSGRB1_WGM32 - LSCA: 128 - LSCB: 32 - LSPA: 4 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM1_SVW4_VWA4_WG64_4_1 + LSCA: 256 + LSCB: 64 + LSPA: 2 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 4096 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 17920 - LdsNumElementsAlignedA: 8320 - LdsNumElementsAlignedB: 9600 + LdsNumElements: 24320 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8320 - LdsOffsetB_Blk: 41088 - LdsOffsetMetadata: 17920 - LdsOffsetMetadata_Blk: 41088 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24320 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -14431,14 +14029,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 15] + MIWaveTile: [4, 7] MIWaveTileA: 4 - MIWaveTileB: 15 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 240 + MacroTile1: 112 MacroTileA: 256 - MacroTileB: 240 + MacroTileB: 112 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -14452,23 +14050,23 @@ MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: true + NoTailLoop: false NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 16 - NumLoadsB: 15 - NumLoadsCoalescedA: 2 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 15 + NumLoadsPerpendicularB: 7 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14554,11 +14152,11 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 60 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x240x32_MI16x16x1_SN_GRVWA2_GSU1_LWPMn1_MIWT4_15_NEPBS0_NLCA2_SUS256_SPO0_SSO0_WSGRB1_WGM32 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM1_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 1 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 @@ -14571,9 +14169,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 15 + ThreadTile1: 7 ThreadTileA: 16 - ThreadTileB: 15 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -14586,30 +14184,30 @@ VectorStore: -1 VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 2 + _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -14630,10 +14228,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14642,42 +14240,45 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPMn1_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO0_WSGRB1_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_14_SUM1_SVW4_VWA4_WG64_4_1 LSCA: 256 LSCB: 64 - LSPA: 4 - LSPB: 8 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 31872 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31872 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: -1 + LocalWritePerMfma: 0.4 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 @@ -14685,22 +14286,22 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 14] + MIWaveTileA: 4 + MIWaveTileB: 14 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 224 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -14711,18 +14312,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14808,26 +14409,26 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 61 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x64_MI32x32x1_SN_GSU1_LWPMn1_MIAV0_MIWT2_4_NEPBS0_SUS512_SPO1_SSO0_WSGRB1_WGM1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_14_SUM1_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUMapping: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 14 + ThreadTileA: 16 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -14838,13 +14439,13 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] @@ -14857,13 +14458,13 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 2 + _staggerStrideShift: 1 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true AssertFree0ElementMultiple: 8 - AssertFree1ElementMultiple: 1 + AssertFree1ElementMultiple: 8 AssertSummationElementMultiple: 32 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -14884,7 +14485,7 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalWriteVectorWidth: 4 @@ -14896,33 +14497,36 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO0_WSGRB1_WGM1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_14_SUM2_SVW4_VWA4_WG64_4_1 LSCA: 256 LSCB: 64 - LSPA: 4 - LSPB: 2 + LSPA: 2 + LSPB: 4 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 28032 - LdsNumElementsAlignedA: 16512 - LdsNumElementsAlignedB: 11520 + LdsNumElements: 31872 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 15232 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16512 - LdsOffsetB_Blk: 49280 - LdsOffsetMetadata: 28032 - LdsOffsetMetadata_Blk: 49280 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31872 + LdsOffsetMetadata_Blk: 49408 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -14939,14 +14543,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 9] + MIWaveTile: [4, 14] MIWaveTileA: 4 - MIWaveTileB: 9 + MIWaveTileB: 14 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 144 + MacroTile1: 224 MacroTileA: 256 - MacroTileB: 144 + MacroTileB: 224 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -14965,18 +14569,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 36 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 NumLoadsA: 8 - NumLoadsB: 18 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 18 + NumLoadsPerpendicularB: 14 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15062,11 +14666,11 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x144x64_MI16x16x1_SN_LDSB1_GRVWB2_GSU1_LWPMn1_MIAV0_MIWT4_9_NEPBS0_SUS256_SPO0_SSO0_WSGRB1_WGM1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_14_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 - StaggerUMapping: 0 + StaggerUMapping: 2 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 @@ -15079,9 +14683,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 9 + ThreadTile1: 14 ThreadTileA: 16 - ThreadTileB: 9 + ThreadTileB: 14 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -15094,8 +14698,8 @@ VectorStore: -1 VectorWidthA: 4 VectorWidthB: 1 - WaveSeparateGlobalReadA: 2 - WaveSeparateGlobalReadB: 1 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 2 WaveSeparateGlobalReadMetadata: 0 WavefrontSize: 64 WorkGroup: [64, 4, 1] @@ -15127,7 +14731,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 128 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -15138,10 +14742,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15150,42 +14754,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSUM_LBSPPA2048_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 - LSCA: 128 - LSCB: 128 - LSPA: 4 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_8_SUM2_SVW4_VWA4_WG64_4_1 + LSCA: 256 + LSCB: 32 + LSPA: 2 LSPB: 4 - LVCA: 16 - LVCB: 16 + LVCA: 32 + LVCB: 8 LVPA: 1 LVPB: 1 LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 25856 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 12672 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 4352 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 49408 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 24704 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25856 - LdsOffsetMetadata_Blk: 49408 + LdsOffsetMetadata: 12672 + LdsOffsetMetadata_Blk: 24704 LdsPadA: 16 - LdsPadB: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 - LocalWritePerMfma: -1 + LocalWritePerMfma: 0.4 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -15196,14 +14800,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -15217,7 +14821,7 @@ MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: false + NoTailLoop: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -15226,13 +14830,13 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -15319,8 +14923,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x64x128_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA2048_LBSPPB256_LPA16_LPB16_LRVW8_MIWT2_4_SVW2_VWA2_WG64_4_1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPM0p40_MIWT4_8_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 2 @@ -15328,17 +14932,17 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -15349,7 +14953,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 1 WaveSeparateGlobalReadB: 2 @@ -15359,16 +14963,16 @@ WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 0 + _staggerStrideShift: 2 - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false @@ -15395,10 +14999,10 @@ ExpandPointerSwap: 0 GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15407,68 +15011,68 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWA8_GRVWB8_GSUM_LBSPPA0_LBSPPB128_LPA0_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM2_SVW4_VWA4_WG64_4_1 + LSCA: 256 LSCB: 64 - LSPA: 4 + LSPA: 2 LSPB: 4 - LVCA: 16 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 1 LVPB: 1 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 17408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 9216 + LdsNumElements: 24320 + LdsNumElementsAlignedA: 16640 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16640 + LdsOffsetB_Blk: 49408 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetMetadata: 24320 + LdsOffsetMetadata_Blk: 49408 + LdsPadA: 16 + LdsPadB: 4 LdsPadMetadata: 0 - LocalReadVectorWidth: 8 + LocalReadVectorWidth: 4 LocalSplitU: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 8, 1, 1, 1] + MIBlock: [16, 16, 16, 1, 1, 1] MIInputPerThread: 4 MIInputPerThreadA: 4 MIInputPerThreadB: 4 MIInputPerThreadMetadata: 4 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 7] + MIWaveTileA: 4 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 112 + MacroTileA: 256 + MacroTileB: 112 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 8 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 8, 1] + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 @@ -15483,14 +15087,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 112 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15576,8 +15180,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 64 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x64_MI32x32x1_SN_LDSB1_GRVWA8_GRVWB8_GSU1_LBSPPA0_LBSPPB128_LPA0_LPB8_LRVW8_MIWT2_2_SVW2_VWA2_WG64_4_1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x112x64_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_7_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 2 @@ -15585,17 +15189,17 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSyncOpt: 0 - StoreVectorWidth: 2 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 7 + ThreadTileA: 16 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -15606,7 +15210,7 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 1 WaveSeparateGlobalReadB: 2 @@ -15641,7 +15245,7 @@ ClusterLocalRead: 1 CodeObjectVersion: V3 CustomKernelName: '' - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -15664,32 +15268,32 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB4_GSUM_LBSPPA2048_LBSPPB128_LPA16_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI16x16x1_SN_LDSB1_GRVWB4_GSUM_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_8_SUM2_SVW4_VWA4_WG64_4_1 LSCA: 256 - LSCB: 64 + LSCB: 32 LSPA: 2 LSPB: 4 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 1 LVPB: 1 LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 LdsInitCVgprs: false - LdsNumElements: 31872 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 15232 + LdsNumElements: 12672 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 4352 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 49408 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 24704 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31872 - LdsOffsetMetadata_Blk: 49408 + LdsOffsetMetadata: 12672 + LdsOffsetMetadata_Blk: 24704 LdsPadA: 16 LdsPadB: 4 LdsPadMetadata: 0 @@ -15698,8 +15302,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 16, 1, 1, 1] @@ -15710,14 +15314,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [4, 14] + MIWaveTile: [4, 8] MIWaveTileA: 4 - MIWaveTileB: 14 + MIWaveTileB: 8 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 224 + MacroTile1: 128 MacroTileA: 256 - MacroTileB: 224 + MacroTileB: 128 MagicDivAlg: 2 MatrixInstB: 1 MatrixInstBM: 1 @@ -15731,7 +15335,7 @@ MinVgprNumber: 0 NoLdsWriteCode: false NoReject: false - NoTailLoop: false + NoTailLoop: true NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 @@ -15740,14 +15344,14 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 8 - NumLoadsB: 14 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15833,8 +15437,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 65 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x224x64_MI16x16x1_SN_LDSB1_GRVWA8_GRVWB4_GSU1_LBSPPA2048_LBSPPB128_LPA16_LPB4_LRVW4_MIWT4_14_SVW4_VWA4_WG64_4_1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT256x128x32_MI16x16x1_SN_LDSB1_GRVWB4_GSU1_LBSPPA2048_LPA16_LPB4_LRVW4_LWPMn1_MIWT4_8_SUM2_SVW4_VWA4_WG64_4_1 SourceSwap: 1 StaggerU: 4 StaggerUMapping: 2 @@ -15850,9 +15454,9 @@ SuppressNoLoadLoop: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 14 + ThreadTile1: 8 ThreadTileA: 16 - ThreadTileB: 14 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true UnrollMajorLDSA: false @@ -15873,16 +15477,16 @@ WorkGroupMapping: 1 WorkGroupReduction: false WorkspaceCheck: [0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 1 + _staggerStrideShift: 2 - [2, 3, 0, 1] - - - [1104, 1, 1, 4608] - [0, 675.057] @@ -15965,69 +15569,69 @@ - - [9216, 1408, 1, 768] - [36, 368290.0] - - [16, 16, 1, 1024] - - [41, 0.0] + - [38, 0.0] - - [16, 16, 1, 8192] - - [47, 0.0] + - [43, 0.0] - - [16, 16, 1, 65536] - - [48, 0.0] + - [44, 0.0] - - [16, 2048, 1, 1024] - - [42, 0.0] + - [50, 0.0] - - [16, 2048, 1, 8192] - - [49, 0.0] + - [39, 0.0] - - [16, 2048, 1, 65536] - - [50, 0.0] + - [51, 0.0] - - [16, 8192, 1, 1024] - - [43, 0.0] + - [52, 0.0] - - [16, 8192, 1, 8192] - - [51, 0.0] + - [53, 0.0] - - [16, 8192, 1, 65536] - - [52, 0.0] + - [54, 0.0] - - [2048, 16, 1, 1024] - - [39, 0.0] + - [45, 0.0] - - [2048, 16, 1, 8192] - - [53, 0.0] + - [46, 0.0] - - [2048, 16, 1, 65536] - - [40, 0.0] + - [37, 0.0] - - [2048, 2048, 1, 1024] - - [57, 0.0] + - [55, 0.0] - - [2048, 2048, 1, 8192] - - [44, 0.0] + - [56, 0.0] - - [2048, 2048, 1, 65536] - - [58, 0.0] + - [57, 0.0] - - [2048, 8192, 1, 1024] - - [45, 0.0] + - [58, 0.0] - - [2048, 8192, 1, 8192] - [59, 0.0] - - [2048, 8192, 1, 65536] - [60, 0.0] - - [8192, 16, 1, 1024] - - [37, 0.0] + - [47, 0.0] - - [8192, 16, 1, 8192] - - [54, 0.0] + - [48, 0.0] - - [8192, 16, 1, 65536] - - [38, 0.0] + - [49, 0.0] - - [8192, 2048, 1, 1024] - [61, 0.0] - - [8192, 2048, 1, 8192] - - [55, 0.0] + - [62, 0.0] - - [8192, 2048, 1, 65536] - - [46, 0.0] + - [62, 0.0] - - [8192, 8192, 1, 1024] - - [56, 0.0] + - [63, 0.0] - - [8192, 8192, 1, 8192] - - [65, 0.0] + - [60, 0.0] - - [8192, 8192, 1, 65536] - - [62, 0.0] + - [60, 0.0] - - [512, 512, 1, 512] - - [63, 0.0] + - [40, 0.0] - - [1024, 1024, 1, 1024] - - [64, 0.0] + - [41, 0.0] - - [2048, 2048, 1, 2048] - - [64, 0.0] + - [41, 0.0] - - [4096, 4096, 1, 4096] - - [65, 0.0] + - [42, 0.0] - - [4096, 4096, 1, 8192] - - [65, 0.0] + - [42, 0.0] - null - null - DeviceEfficiency